Setup¶

In [ ]:
""" Run the latest and greatest """
%pip install --upgrade pip
%pip install -r requirements.txt --upgrade
In [ ]:
from bs4 import BeautifulSoup
from collections import Counter
import html5lib
import itertools
import json
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests
import seaborn as sns
import shifterator as sh
import string
from string import punctuation as punct
import sys
# import tensorflow as tf

print('Running the following Package Versions:')
print('---------------------------------------')

print(f'Matplotlib      V.{mpl.__version__}')
print(f'NumPy           V.{np.__version__}')
print(f'Pandas          V.{pd.__version__}')
print(f'Seaborn         V.{sns.__version__}')
# print(f'TensorFlow      V.{str(tf.__version__)[:6]}')

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

""" In case Computer Modern Roman size 10 isn't an option. """
# mpl.font_manager.findSystemFonts(fontpaths = None, fontext = 'ttf')

""" Set default font for plots. """
mpl.rc('font',
       family = 'serif',
       serif = 'cmr10')
mpl.rc('mathtext',
       fontset = 'cm')

%matplotlib inline
Running the following Package Versions:
---------------------------------------
Matplotlib      V.3.5.3
NumPy           V.1.21.6
Pandas          V.1.1.5
Seaborn         V.0.12.2

Functions¶

In [ ]:
def load_text(f: str = '/content/text.txt',
              prnt: bool = True):
  with open(f) as txt:
    gram_list = re.findall(r"\w+[^\s\.,?!;]+|\w+|[\.,?!;]",
                           ''.join(txt.readlines()))
    if prnt:
      print(f"Total n-grams: {len(np.array(gram_list)):>14,}")
  return np.array(gram_list)

def get_word_freq(f: str = '/content/text.txt',
                  prnt: bool = True):
  n_gram_vec = load_text(f)
  freq = np.array(np.unique(n_gram_vec, return_counts=True, axis=None))
  if prnt:
    print(f'Total unique n-grams: {len(np.array(freq[1])):>7,}')
  return freq, n_gram_vec

def plot_zipf(f: str = '/content/text.txt'):
  freq = load_text(f)
  plt.scatter(x = np.linspace(len(freq),1,len(freq)), 
              y = freq,
              s = 2,
              c = 'k',
              alpha = 0.8,
              linewidths=0.)
  plt.xscale('log')
  plt.yscale('log')
  plt.suptitle('Prior',
              fontsize = 16)
  plt.title('Rank-Frequency (Zipf) Plot')
  plt.xlabel(r'log$_{10}$(Rank)')
  plt.ylabel(r'log$_{10}$(Frequency)')
  sns.despine()
  plt.show()

def subdue_ts_df(df, dh):
  return pd.DataFrame([i for i in df if i <= df.mean() - dh or i <= df.mean() + dh]).interpolate(method='linear')

def lens_ts_df(df, w):
   return pd.DataFrame(df.rolling(int(w)).sum() / w).interpolate(method='linear')

def gen_ts_df(source):
  ts = np.char.lower(load_text(source, False))
  sentiment_lmt1 = pd.read_table('/Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/data/nlp/ref/dataset_s1.tsv')
  sentiment_lmt1.set_index(sentiment_lmt1['word'], inplace = True)
  ind_list = [sentiment_lmt1.index.get_loc(word) for word in ts if word in sentiment_lmt1.index]
  return sentiment_lmt1.iloc[ind_list]['happiness_average']

def gen_ts_windows(ts_df: pd.core.frame.DataFrame = None,
                   dh: float = 0.,
                   t: list = None, 
                   fixed_y: bool = False, 
                   show_plot: bool = False, 
                   ax = None):
  if t is None:
    t = [10**1, 10**1.5, 10**2, 10**2.5, 10**3, 10**3.5, 10**4]

  if ax is None:
    fig, axs = plt.subplots(7, 1, figsize = (3.5, 9), dpi = 180)

  else:
    return overplot(ts_df, t, ax, fixed_y)

  for i, ax in enumerate(axs):
      temp = ts_df
      temp = lens_ts_df(ts_df, t[i])
      temp.plot(ax = ax,
                alpha = 2/3,
                linewidth = 1/3,
                color='maroon',
                legend = False)
      ax.set_title(r'$\delta_h=0; T=$'+f'{int(t[i])}',
                      fontsize = 7)

      n = len(temp)
      ticks = [int(i) for i in np.linspace(1, n, 6)]
      ax.set_xticks(ticks)
      ax.set_xlabel('')
      if fixed_y:
        ax.set_yticks(np.linspace(3.7, 6.1, 4))
        ax.set_yticklabels(labels = np.linspace(3.7, 6.1, 4),fontsize = 6)

      if i == 6:
          ax.set_xticklabels([], fontsize = 6)
          ax.set_xlabel('Word Index', fontsize = 6)

  if show_plot:
    plt.suptitle('Average Lexical Happiness',
                 fontsize = 10)
    plt.tight_layout()
    plt.savefig(fname = f'bm_T{int(t[i])}.jpg')
    plt.show()
  return axs

def overplot(ts_df, t, ax, fixed_y):
  temp = ts_df
  temp = lens_ts_df(ts_df, t)
  temp.plot(ax = ax,
            alpha = 2/3,
            linewidth = 1/3,
            color='maroon',
            legend = False)
  ax.set_title(r'$\delta_h=0; T=$'+f'{int(t)}',
                  fontsize = 7)

  n = len(temp)
  ticks = [int(i) for i in np.linspace(1, n, 6)]
  ax.set_xticks(ticks)
  ax.set_xlabel('')
  if fixed_y:
    ax.set_yticks(np.linspace(3.7, 6.1, 4))
    ax.set_yticklabels(labels = np.linspace(3.7, 6.1, 4),fontsize = 6)
  return ax

def gen_ts_delta_h(ts_df: pd.core.frame.DataFrame = None, 
                   dh: list = None, 
                   t: float = 10**3.5, 
                   fixed_y: bool = False, 
                   show_plot: bool = False, 
                   ax = None):
  if dh is None:
    dh = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5]

  if ax is None:
    fig, axs = plt.subplots(7, 1, figsize = (3.5, 9), dpi = 180)

  else:
    for i in dh:
      temp = ts_df
      temp = subdue_ts_df(ts_df, i)
      temp = lens_ts_df(ts_df, t)
      ax.plot(np.linspace(0, len(ts_df), len(temp)),
              temp.values,
              alpha = 2/3,
              linewidth = 1/3)
    return ax
  
  for i, ax in enumerate(axs):
      temp = subdue_ts_df(ts_df, dh[i])
      temp = lens_ts_df(temp, t)
      temp.plot(ax = ax,
                alpha = 2/3,
                linewidth = 1/3,
                color='maroon',
                legend = False)
      ax.set_title(r'$\delta_h=$'+f'{dh[i]}; '+r'$T=$'+f'{int(t)}',
                   fontsize = 7)

      n = len(temp)
      ticks = [int(i) for i in np.linspace(1, n, 6)]
      ax.set_xticks(ticks)
      
      if fixed_y:
        ax.set_yticks([5.2,5.3,5.4])
        ax.set_yticklabels(labels = [5.2,5.3,5.4],fontsize = 6)

      if i != 3.5:
          ax.set_xticklabels([], fontsize = 6)

  if show_plot:
    plt.suptitle('Average Lexical Happiness',
                 fontsize = 10)
    plt.tight_layout()
    plt.savefig(fname = f'bm_T{int(t[i])}.jpg')
    plt.show()
  return axs

def stacked_plot(ts_df, 
                 t, 
                 show_plot = False):
    re_df = pd.DataFrame()
    dh = np.arange(0.5,4,0.5)
    for i in dh:
        temp = subdue_ts_df(ts_df, i)
        temp = lens_ts_df(temp, t)
        re_df = pd.concat(
           (re_df, temp.rename(columns={0:i})),
           axis=1,
           ignore_index=True,
           keys=list(re_df.columns).append(i),
           names=['delta_h', None])
    if show_plot:
        fig, ax = plt.subplots(
            1,1,
            sharey=True,
            figsize=(7.5, 3.5),
            dpi=180)
        re_df.interpolate(method='linear').plot(
            ax=ax,
            alpha=0.8,
            linewidth=0.5,
            colormap='gnuplot',
            marker='.')
        fig.show()
    return re_df

def gen_shift_args(wf_1 : dict = None, 
                   wf_2 : dict = None, 
                   raise_err : bool = False, 
                   prop_shift : bool = False, 
                   set_ref : bool = False, 
                   lens: list = None,
                   source: str = None):
    
  if lens is None:
    lens = [(3,7)]

  if raise_err or prop_shift:
      raise_err = 'error'
      types = set(wf_1.keys()).union(wf_2.keys())
      for t in types:
          if t not in wf_1:
              wf_1[t] = 0
          elif t not in wf_2:
              wf_2[t] = 0

  else:
     raise_err = 'exclude'

  if set_ref:
     ref = np.mean(subdue_ts_df(gen_ts_df(source = source), lens[0][0]-lens[0][1])[0])

  else:
     ref = 0

  print("Lensing Window: [{}, {}]".format(lens[0][0], lens[0][1]))
  print("Reference Happiness: {:4.3f}".format(ref))

  wa_args = { 
      'type2freq_1' : wf_1,
      'type2freq_2' : wf_2,
      'type2score_1' :  'labMT_English',
      'type2score_2' : 'labMT_English',
      'reference_value' : ref,
      'handle_missing_scores' : raise_err,
      'stop_lens' : lens,
      'stop_words' : None,
      'normalization' : "variation",
      'source' : source
  }

  ps_args = {
      'type2freq_1' : wf_1,
      'type2freq_2' : wf_2,
      'type2score_1' :  'labMT_English',
      'type2score_2' : 'labMT_English',
      'reference_value' : ref,
      'handle_missing_scores' : raise_err,
      'stop_lens' : lens,
      'stop_words' : None,
      'normalization' : "variation"
  }

  en_args = {
      'type2freq_1' : wf_1,
      'type2freq_2' : wf_2,
      'base' : 2,
      'alpha' : 1,
      'reference_value' : ref,
      'normalization' : "variation",
      'source' : source
  }

  kl_args = {
      'type2freq_1' : wf_1,
      'type2freq_2' : wf_2,
      'base' : 2,
      'reference_value' : ref,
      'normalization' : "variation",
  }

  js_args = {
      'type2freq_1' : wf_1,
      'type2freq_2' : wf_2,
      'base' : 2,
      'weight_1' : 0.5,
      'weight_2' : 0.5,
      'alpha' : 1,
      'reference_value' : ref,
      'normalization' : "variation",
  }

  return {'weighted_average' : wa_args,
          'proportion_shift' : ps_args, 
          'entropy_shift' : en_args, 
          'kl_shift' : kl_args, 
          'jsd_shift' : js_args}

##########################################################################################
#                                     TEST  FUNCTIONS                                    #
##########################################################################################

def test_plot(ts_df: pd.core.frame.DataFrame = None):
  fig, ax = plt.subplots(1, 1, figsize=(20,5), dpi=250)
  ts_df.plot(linewidth=0.05, alpha = 0.95, ax = ax)
  ax.set_xticklabels([int(i) for i in np.linspace(1,len(ts_df), 8)])
  plt.show()

def test_str(n_gram: np.ndarray = None,
             prnt: bool = True):
  test_str = ' '.join(n_gram)
  if prnt:
    print(test_str)
  elif test_str:
    print("Non-zero string created.")
  else:
    print('You probably messed up. Try again!')

def test_window(df):
    return df.applymap(lambda x: df.rolling(int(x).sum())/x)

def test_lens(df, dh):
    return df.applymap(lambda x: x if df[x] >= df.mean() - dh and df[x] <= df.mean() + dh else np.nan)

##########################################################################################
#                                                                                        #
##########################################################################################

Generate Word Shift¶

Used to convert csv to txt¶

In [ ]:
# path = '/Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/'

# for f in os.listdir(path):
#     tmp = pd.read_csv(path + str(f))
#     str_txt = str(tmp['Text'].values)
#     f = f.replace('csv','txt')
#     with open(f, 'w') as out:
#         out.write(str_txt)
#         print(f'Written to: {path}{out}')
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='holmes.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='woodham.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='romano.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='bartley.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='ramsey.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='schooland.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='castillo_08_21_09.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='parkland.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='forest.txt' mode='w' encoding='UTF-8'>
Written to: /Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/transcript/transcripts/<_io.TextIOWrapper name='crumbley.txt' mode='w' encoding='UTF-8'>

Plot Word Shifts¶

In [ ]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

extension = '/Users/patrick/Documents/csys303_assignments/violent_crimes_analysis/data/nlp/txt/'
list_txt = os.listdir(extension)
pairwise_txts = itertools.combinations(list_txt, 2)

for i in pairwise_txts:
    first_txt: str = f'{extension}{i[0]}'
    second_txt: str = f'{extension}{i[1]}'

    print('Sample 1:')
    first_wf, first_ng = get_word_freq(first_txt)
    first_ts_df = gen_ts_df(first_txt)

    print('\nSample 2:')
    second_wf, second_ng = get_word_freq(second_txt)
    second_ts_df = gen_ts_df(second_txt)

    first_wf_dict = {i[0] : int(i[1]) for i in zip(first_wf[0,:], first_wf[1,:])}
    second_wf_dict = {i[0] : int(i[1]) for i in zip(second_wf[0,:], second_wf[1,:])}

    args = gen_shift_args(
        first_wf_dict, 
        second_wf_dict,
        set_ref = True,
        lens=[(4,6)],
        source = first_txt
    )

    source = args['entropy_shift'].pop('source')
    source = args['weighted_average'].pop('source')

    fig = plt.figure(1, figsize = (15,10), layout = 'tight')

    sent_shift_was = sh.WeightedAvgShift(**args['weighted_average'])

    sent_shift_ent = sh.EntropyShift(**args['entropy_shift'])

    ax1 = fig.add_subplot(121)
    sent_shift_was.get_shift_graph(
        ax = ax1,
        detailed = False,
        show_plot = False,
        text_size_inset = False,
        cumulative_inset = False,
        system_names = ['Sample 1', 'Sample 2'])

    ax2 = fig.add_subplot(122)
    sent_shift_ent.get_shift_graph(
        ax = ax2,
        detailed = False,
        show_plot = False,
        text_size_inset = False,
        cumulative_inset = False,
        system_names = ['Sample 1', 'Sample 2'])

    fig.suptitle('Weighted Average vs. Entropy Shifts\nLens: [4, 6]', 
                fontsize = 20,
                x = 0.5,
                y = 1.025)

    plt.show()
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            170
Total unique n-grams:     111
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            925
Total unique n-grams:     339
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            358
Total unique n-grams:     201
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            223
Total unique n-grams:     143
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:          3,063
Total unique n-grams:     641
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:          1,720
Total unique n-grams:     453

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.264
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            925
Total unique n-grams:     339
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            358
Total unique n-grams:     201
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            223
Total unique n-grams:     143
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:          3,063
Total unique n-grams:     641
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            170
Total unique n-grams:     111

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.474
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            358
Total unique n-grams:     201
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            223
Total unique n-grams:     143
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:          3,063
Total unique n-grams:     641
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            925
Total unique n-grams:     339

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.031
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            223
Total unique n-grams:     143
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:          3,063
Total unique n-grams:     641
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            358
Total unique n-grams:     201

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.138
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:          3,063
Total unique n-grams:     641
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:            223
Total unique n-grams:     143

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.354
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:            445
Total unique n-grams:     175
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:          3,063
Total unique n-grams:     641

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.418
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:          1,509
Total unique n-grams:     415
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:            445
Total unique n-grams:     175

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.505
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:            172
Total unique n-grams:     104
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:          1,509
Total unique n-grams:     415

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.400
Sample 1:
Total n-grams:            172
Total unique n-grams:     104

Sample 2:
Total n-grams:          1,051
Total unique n-grams:     320
Lensing Window: [4, 6]
Reference Happiness: 5.531
Sample 1:
Total n-grams:            172
Total unique n-grams:     104

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.531
Sample 1:
Total n-grams:            172
Total unique n-grams:     104

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.531
Sample 1:
Total n-grams:            172
Total unique n-grams:     104

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.531
Sample 1:
Total n-grams:            172
Total unique n-grams:     104

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.531
Sample 1:
Total n-grams:          1,051
Total unique n-grams:     320

Sample 2:
Total n-grams:            787
Total unique n-grams:     300
Lensing Window: [4, 6]
Reference Happiness: 5.314
Sample 1:
Total n-grams:          1,051
Total unique n-grams:     320

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.314
Sample 1:
Total n-grams:          1,051
Total unique n-grams:     320

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.314
Sample 1:
Total n-grams:          1,051
Total unique n-grams:     320

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.314
Sample 1:
Total n-grams:            787
Total unique n-grams:     300

Sample 2:
Total n-grams:            146
Total unique n-grams:      93
Lensing Window: [4, 6]
Reference Happiness: 5.269
Sample 1:
Total n-grams:            787
Total unique n-grams:     300

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.269
Sample 1:
Total n-grams:            787
Total unique n-grams:     300

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.269
Sample 1:
Total n-grams:            146
Total unique n-grams:      93

Sample 2:
Total n-grams:            266
Total unique n-grams:     113
Lensing Window: [4, 6]
Reference Happiness: 5.194
Sample 1:
Total n-grams:            146
Total unique n-grams:      93

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.194
Sample 1:
Total n-grams:            266
Total unique n-grams:     113

Sample 2:
Total n-grams:          8,551
Total unique n-grams:   1,314
Lensing Window: [4, 6]
Reference Happiness: 5.556